1 Proteins of Interest

proteins_of_interest <- c(
  "PDCD4",
  "LGALS1",
  "LGALS3",
  "LGALSL", #replaced galectin by the gene name
  "CCL17",
  "BCCIP",
  "CDC42",
  "CDC37",
  "CDC38",
  "CCD86",
  "LPXN",
  "PAX",
  "CYTB",
  "SWI/SNF",
  "NFKB1",
  "TNFAIP8",
  "ST13",
  "NEUA",
  "CYC1",
  "PDCD5",
  "PDCD6",
  "PDCD6IP",
  "SERPINB6",
  "CD63",
  "CD70",
  "LCA",
  "NUDC",
  "BAG6",
  "BCLAF1",
  "JAK",
  "STAT6",
  "STAT3",
  "STAT5A",
  "STAT1",
  "NUDT5",
  "PRCP",
  "SMARCC2",
  "BANF1",
  "BAF", # added baf because BANF1 = BAF is not possible
  "MANF",
  "NENF",
  "DDB1",
  "ARMT1",
  "FEN1",
  "APEX1",
  "PAFAH1B1",
  "EEF1E1",
  "HDGF",
  "GRB2",
  "MYDGF",
  "OGFR",
  "CCAR1",
  "CCAR2",
  "AIFM1",
  "API5",
  "ACIN1",
  "RBM4",
  "SDHB",
  "PURA",
  "SDHA",
  "DLST",
  "SUCLG1",
  "OXCT1",
  "OGDH",
  "SUCLG2",
  "SUCLA2",
  "ALDH",
  "SOD2",
  "MCM2",
  "ACY1",
  "GATM",
  "HEATR",
  "NUP210",
  "NUP214",
  "VDAC1",
  "VDAC2",
  "VDAC3",
  "BCCP",
  "HK2",
  "HK1",
  "PARP1",
  "PARP4",
  "H2AFX",
  "PCYT1A",
  "HSPA1B",
  "CASP3",
  "SLC4A7",
  "EPS15L1",
  "KCNAB2",
  "TNFRSF8",
  "CCBL2",
  "GAR1"
)

2 Load Libraries & Plot function & Colors

3 Functions

############################################################
# Function: run_deqms_pipeline
# Purpose:  Fit limma model, apply DEqMS (spectraCounteBayes),
#           and show the variance boxplot.
############################################################

run_deqms_pipeline <- function(protein.matrix, design, pep.count.table,
                               contrast_string = "classC-classE",
                               main_title = "Label-free dataset") {
  
  # Step 1: Fit linear model with limma
  fit1 <- limma::lmFit(protein.matrix, design = design)
  
  # Step 2: Define contrasts
  cont <- limma::makeContrasts(contrasts = contrast_string, levels = design)
  fit2 <- limma::contrasts.fit(fit1, contrasts = cont)
  
  # Step 3: Empirical Bayes shrinkage
  fit3 <- limma::eBayes(fit2)
  fit3$sigma[which(fit3$sigma == 0)] <- 1e-14
  
  # Step 4: Attach peptide counts
  fit3$count <- pep.count.table[rownames(fit3$coefficients), "count"]
  
  # Validate peptide count input
  if (any(is.na(fit3$count)) || min(fit3$count, na.rm = TRUE) <= 0) {
    stop("Error: At least one entry in 'pep.count.table$count' is NA or <= 0. Please verify inputs.")
  }
  
  # Step 5: DEqMS modeling
  fit4 <- DEqMS::spectraCounteBayes(fit3)
  
  # Step 6: Visualization
  DEqMS::VarianceBoxplot(fit4, n = 20,
                         main = main_title,
                         xlab = "peptide count + 1")
  
  return(fit4)
}


############################################################
# Function: plot_volcano
# Purpose:  Create a volcano plot using log2 fold change and q-values
############################################################

plot_volcano <- function(res, title = "", lfc_limit = NA) {
  res.plot <- res %>%
    dplyr::filter(!is.na(adj.P.Val)) %>%
    dplyr::arrange(adj.P.Val) %>%
    dplyr::rowwise() %>%
    dplyr::mutate(
      threshold = adj.P.Val < 0.05 & abs(logFC) > 1.0,
      out_of_bounds = ifelse(is.na(lfc_limit), 0, (abs(logFC) > lfc_limit) * sign(logFC)),
      logFC_capped = ifelse(out_of_bounds != 0, lfc_limit * sign(logFC), logFC)
    ) %>%
    dplyr::ungroup()
  
  res.plot %<>% 
    dplyr::mutate(genelabels = ifelse(threshold, gene, ""))
  
  maxFC <- max(abs(res.plot$logFC))
  if (!is.na(lfc_limit) && lfc_limit < maxFC) {
    xlim <- lfc_limit
  } else {
    xlim <- maxFC * 1.04
  }
  
  ggplot2::ggplot(res.plot, ggplot2::aes(x = logFC_capped, y = adj.P.Val)) +
    ggplot2::geom_point(data = subset(res.plot, out_of_bounds == 0),
                        ggplot2::aes(colour = threshold), alpha = 0.5) +
    ggplot2::geom_point(data = subset(res.plot, out_of_bounds == -1),
                        ggplot2::aes(colour = threshold), shape = "\u25c4", size = 2) +
    ggplot2::geom_point(data = subset(res.plot, out_of_bounds == 1),
                        ggplot2::aes(colour = threshold), shape = "\u25BA", size = 2) +
    ggplot2::geom_hline(yintercept = 0.05, linetype = 2) +
    ggplot2::geom_vline(xintercept = c(-1, 1), linetype = 2) +
    ggplot2::scale_x_continuous(breaks = scales::pretty_breaks(),
                                limits = c(-xlim, xlim),
                                expand = ggplot2::expansion(0.01)) +
    ggplot2::scale_y_continuous(trans = c("log10", "reverse"),
                                breaks = scales::log_breaks(),
                                labels = scales::scientific) +
    ggplot2::ggtitle(title) +
    ggplot2::xlab("log2 Fold Change") +
    ggplot2::ylab("q-value") +
    ggplot2::theme_minimal() +
    ggplot2::theme(legend.position = "none")
}


############################################################
# Function: extract_gene_names
# Purpose:  Parse FASTA headers and extract gene names (GN=... entries)
############################################################

extract_gene_names <- function(fasta_headers) {
  sapply(fasta_headers, function(x) {
    if (is.na(x) || x == "") return(NA_character_)  # Handle empty or NA lines
    
    # Find all GN=... matches
    genes <- unlist(regmatches(x, gregexpr("GN=([^ ]+)", x)))
    if (length(genes) == 0) return(NA_character_)  # No GN found
    
    # Remove "GN=" prefix
    genes <- gsub("GN=", "", genes)
    
    # Remove duplicates and collapse multiple entries
    paste(unique(genes), collapse = ";")
  })
}

4 C vs E

############################################################
# C vs E
############################################################

# Load MaxQuant output
df <- read.csv("./results_maxquant/hl540_cryo_galaxy.txt", sep = "\t")
df.prot <- df

# Remove reverse/contaminant/site-only identifications
df.prot <- df.prot[!df.prot$Reverse == "+", ]
df.prot <- df.prot[!df.prot$Potential.contaminant == "+", ]
df.prot <- df.prot[!df.prot$Only.identified.by.site == "+", ]

# Extract gene names using helper
df.prot$Gene.names <- extract_gene_names(df.prot$Fasta.headers)

# Extract LFQ intensity columns (as provided)
df.LFQ <- df.prot[, c(260:262, 266:271, 272:277, 278:283)]
df.LFQ[df.LFQ == 0] <- NA; print(colnames(df.LFQ))
##  [1] "LFQ.intensity.HL_540_CR_C1_A_01.raw.thermo.raw"
##  [2] "LFQ.intensity.HL_540_CR_C1_A_02.raw.thermo.raw"
##  [3] "LFQ.intensity.HL_540_CR_C1_A_03.raw.thermo.raw"
##  [4] "LFQ.intensity.HL_540_CR_C2_A_01.raw.thermo.raw"
##  [5] "LFQ.intensity.HL_540_CR_C2_A_02.raw.thermo.raw"
##  [6] "LFQ.intensity.HL_540_CR_C2_A_03.raw.thermo.raw"
##  [7] "LFQ.intensity.HL_540_CR_C2_B_01.raw.thermo.raw"
##  [8] "LFQ.intensity.HL_540_CR_C2_B_02.raw.thermo.raw"
##  [9] "LFQ.intensity.HL_540_CR_C2_B_03.raw.thermo.raw"
## [10] "LFQ.intensity.HL_540_CR_E1_A_01.raw.thermo.raw"
## [11] "LFQ.intensity.HL_540_CR_E1_A_02.raw.thermo.raw"
## [12] "LFQ.intensity.HL_540_CR_E1_A_03.raw.thermo.raw"
## [13] "LFQ.intensity.HL_540_CR_E1_B_01.raw.thermo.raw"
## [14] "LFQ.intensity.HL_540_CR_E1_B_02.raw.thermo.raw"
## [15] "LFQ.intensity.HL_540_CR_E1_B_03.raw.thermo.raw"
## [16] "LFQ.intensity.HL_540_CR_E2_A_01.raw.thermo.raw"
## [17] "LFQ.intensity.HL_540_CR_E2_A_02.raw.thermo.raw"
## [18] "LFQ.intensity.HL_540_CR_E2_A_03.raw.thermo.raw"
## [19] "LFQ.intensity.HL_540_CR_E2_B_01.raw.thermo.raw"
## [20] "LFQ.intensity.HL_540_CR_E2_B_02.raw.thermo.raw"
## [21] "LFQ.intensity.HL_540_CR_E2_B_03.raw.thermo.raw"
# Set protein IDs as row names
rownames(df.LFQ) <- df.prot$Majority.protein.IDs

# Transpose: rows = samples, cols = proteins
df.LFQ <- t(df.LFQ)

# Track columns that are entirely NA (align with peptide-count construction)
keep_cols <- colSums(!is.na(df.LFQ)) > 0
removed_col_indices <- which(!keep_cols)

# Remove columns that are all NA
df.LFQ.clean <- df.LFQ[, colSums(!is.na(df.LFQ)) > 0]

# Log2 transform
df.LFQ.clean <- log2(df.LFQ.clean)

# Missing-value imputation (local least squares)
result <- llsImpute(as.matrix(df.LFQ.clean), k = 10, correlation = "pearson", allVariables = TRUE)
df.LFQ <- as.data.frame(completeObs(result))
df.LFQ <- as.data.frame(t(df.LFQ))

# Valid observations per group
df.LFQ$valid_A <- apply(df.LFQ[, 1:9], 1, function(x) sum(!is.na(x)))
df.LFQ$valid_B <- apply(df.LFQ[,10:21], 1, function(x) sum(!is.na(x)))

# Protein filter by minimal presence
df.LFQ.filter <- df.LFQ[df.LFQ$valid_A >= 1 | df.LFQ$valid_B >= 2, 1:21]

# Inspect peptide-count columns for these samples
print(colnames(df.prot[, c(59:61, 65:70, 71:76, 77:82)]))
##  [1] "Razor...unique.peptides.HL_540_CR_C1_A_01.raw.thermo.raw"
##  [2] "Razor...unique.peptides.HL_540_CR_C1_A_02.raw.thermo.raw"
##  [3] "Razor...unique.peptides.HL_540_CR_C1_A_03.raw.thermo.raw"
##  [4] "Razor...unique.peptides.HL_540_CR_C2_A_01.raw.thermo.raw"
##  [5] "Razor...unique.peptides.HL_540_CR_C2_A_02.raw.thermo.raw"
##  [6] "Razor...unique.peptides.HL_540_CR_C2_A_03.raw.thermo.raw"
##  [7] "Razor...unique.peptides.HL_540_CR_C2_B_01.raw.thermo.raw"
##  [8] "Razor...unique.peptides.HL_540_CR_C2_B_02.raw.thermo.raw"
##  [9] "Razor...unique.peptides.HL_540_CR_C2_B_03.raw.thermo.raw"
## [10] "Razor...unique.peptides.HL_540_CR_E1_A_01.raw.thermo.raw"
## [11] "Razor...unique.peptides.HL_540_CR_E1_A_02.raw.thermo.raw"
## [12] "Razor...unique.peptides.HL_540_CR_E1_A_03.raw.thermo.raw"
## [13] "Razor...unique.peptides.HL_540_CR_E1_B_01.raw.thermo.raw"
## [14] "Razor...unique.peptides.HL_540_CR_E1_B_02.raw.thermo.raw"
## [15] "Razor...unique.peptides.HL_540_CR_E1_B_03.raw.thermo.raw"
## [16] "Razor...unique.peptides.HL_540_CR_E2_A_01.raw.thermo.raw"
## [17] "Razor...unique.peptides.HL_540_CR_E2_A_02.raw.thermo.raw"
## [18] "Razor...unique.peptides.HL_540_CR_E2_A_03.raw.thermo.raw"
## [19] "Razor...unique.peptides.HL_540_CR_E2_B_01.raw.thermo.raw"
## [20] "Razor...unique.peptides.HL_540_CR_E2_B_02.raw.thermo.raw"
## [21] "Razor...unique.peptides.HL_540_CR_E2_B_03.raw.thermo.raw"
# Build peptide-count table (unique+razor peptides, min across samples)
pep.count.table <- data.frame(
  count     = rowMins(as.matrix(df.prot[-removed_col_indices, c(59:61, 65:70, 71:76, 77:82)])),
  row.names = df.prot$Majority.protein.IDs[-removed_col_indices]
)
# Add pseudocount to avoid zeros
pep.count.table$count <- pep.count.table$count + 1

# Matrix for limma/DEqMS
protein.matrix <- as.matrix(df.LFQ.filter)

# Factors for class and replicate (batch)
class     <- factor(c(rep("C", 9),  rep("E", 12)))
replicate <- factor(c(rep("rep1", 3), rep("rep2", 6),
                      rep("rep1", 6), rep("rep2", 6)))

# Design without intercept; replicate as covariate
design <- model.matrix(~ 0 + class + replicate)
colnames(design)[1:2] <- c("classC", "classE")

# Optional: contrast object (not strictly needed since run_deqms_pipeline uses contrast_string)
cont <- limma::makeContrasts(classC - classE, levels = design)

# Run pipeline
fit_result <- run_deqms_pipeline(
  protein.matrix, design, pep.count.table,
  contrast_string = "classC-classE"
)
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
## : pseudoinverse used at 1
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
## : neighborhood radius 1
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
## : reciprocal condition number 1.6234e-16

# Collect results
DEqMS.results <- outputResult(fit_result, coef_col = 1)
rownames(df.prot) <- df.prot$Majority.protein.IDs
DEqMS.results$Gene.name <- df.prot[DEqMS.results$gene, ]$Gene.names

# Export with timestamp
timestamp <- format(Sys.time(), "%Y%m%d_%H%M%S")
write.csv(DEqMS.results,
          file = paste0("/proj/proteomics/7_combinedHL/evaluation_paper/abc_2025/raw_log2FCdata/hl540_cryo/data_", timestamp, ".csv"))

# Process: remove NA logFC and round numeric columns
processed_df <- DEqMS.results %>%
  dplyr::filter(!is.na(logFC)) %>%
  dplyr::mutate(across(where(is.numeric), ~ round(.x, 3)))
datatable(processed_df, rownames = FALSE)
# Proteins of interest (assumes 'proteins_of_interest' is defined upstream)
processed_df_interest <- processed_df %>%
  dplyr::filter(Gene.name %in% proteins_of_interest)
datatable(processed_df_interest, rownames = FALSE, caption = "proteins of interest")
# Volcano plot
vp_lfc_limit <- 5
plot_volcano(DEqMS.results, "C vs E", vp_lfc_limit)

# Counts for strongly regulated proteins
sig <- DEqMS.results %>% dplyr::filter(sca.P.Value <= 0.05)
n_up    <- sum(sig$logFC >=  1.0, na.rm = TRUE)
n_down  <- sum(sig$logFC <= -1.0, na.rm = TRUE)
n_total <- sum(abs(sig$logFC) >= 1.0, na.rm = TRUE)
cat("Strongly upregulated (logFC ≥ 1.0):", n_up, "\n")
## Strongly upregulated (logFC ≥ 1.0): 155
cat("Strongly downregulated (logFC ≤ -1.0):", n_down, "\n")
## Strongly downregulated (logFC ≤ -1.0): 132
cat("Total |logFC| ≥ 1.0:", n_total, "\n")
## Total |logFC| ≥ 1.0: 287

5 C vs RE

############################################################
# C vs RE
############################################################

df <- read.csv("./results_maxquant/hl540_cryo_galaxy.txt", sep = "\t")
df.prot <- df
df.prot <- df.prot[!df.prot$Reverse == "+", ]
df.prot <- df.prot[!df.prot$Potential.contaminant == "+", ]
df.prot <- df.prot[!df.prot$Only.identified.by.site == "+", ]

df.prot$Gene.names <- extract_gene_names(df.prot$Fasta.headers)

df.LFQ <- df.prot[, c(260:262, 266:271, 296:301, 302:307)]
df.LFQ[df.LFQ == 0] <- NA; print(colnames(df.LFQ))
##  [1] "LFQ.intensity.HL_540_CR_C1_A_01.raw.thermo.raw" 
##  [2] "LFQ.intensity.HL_540_CR_C1_A_02.raw.thermo.raw" 
##  [3] "LFQ.intensity.HL_540_CR_C1_A_03.raw.thermo.raw" 
##  [4] "LFQ.intensity.HL_540_CR_C2_A_01.raw.thermo.raw" 
##  [5] "LFQ.intensity.HL_540_CR_C2_A_02.raw.thermo.raw" 
##  [6] "LFQ.intensity.HL_540_CR_C2_A_03.raw.thermo.raw" 
##  [7] "LFQ.intensity.HL_540_CR_C2_B_01.raw.thermo.raw" 
##  [8] "LFQ.intensity.HL_540_CR_C2_B_02.raw.thermo.raw" 
##  [9] "LFQ.intensity.HL_540_CR_C2_B_03.raw.thermo.raw" 
## [10] "LFQ.intensity.HL_540_CR_RE1_A_01.raw.thermo.raw"
## [11] "LFQ.intensity.HL_540_CR_RE1_A_02.raw.thermo.raw"
## [12] "LFQ.intensity.HL_540_CR_RE1_A_03.raw.thermo.raw"
## [13] "LFQ.intensity.HL_540_CR_RE1_B_01.raw.thermo.raw"
## [14] "LFQ.intensity.HL_540_CR_RE1_B_02.raw.thermo.raw"
## [15] "LFQ.intensity.HL_540_CR_RE1_B_03.raw.thermo.raw"
## [16] "LFQ.intensity.HL_540_CR_RE2_A_01.raw.thermo.raw"
## [17] "LFQ.intensity.HL_540_CR_RE2_A_02.raw.thermo.raw"
## [18] "LFQ.intensity.HL_540_CR_RE2_A_03.raw.thermo.raw"
## [19] "LFQ.intensity.HL_540_CR_RE2_B_01.raw.thermo.raw"
## [20] "LFQ.intensity.HL_540_CR_RE2_B_02.raw.thermo.raw"
## [21] "LFQ.intensity.HL_540_CR_RE2_B_03.raw.thermo.raw"
rownames(df.LFQ) <- df.prot$Majority.protein.IDs
df.LFQ <- t(df.LFQ)

keep_cols <- colSums(!is.na(df.LFQ)) > 0
removed_col_indices <- which(!keep_cols)

df.LFQ.clean <- df.LFQ[, colSums(!is.na(df.LFQ)) > 0]
df.LFQ.clean <- log2(df.LFQ.clean)

result <- llsImpute(as.matrix(df.LFQ.clean), k = 10, correlation = "pearson", allVariables = TRUE)
df.LFQ <- as.data.frame(completeObs(result))
df.LFQ <- as.data.frame(t(df.LFQ))

df.LFQ$valid_A <- apply(df.LFQ[, 1:9], 1, function(x) sum(!is.na(x)))
df.LFQ$valid_B <- apply(df.LFQ[, 10:21], 1, function(x) sum(!is.na(x)))
df.LFQ.filter <- df.LFQ[df.LFQ$valid_A >= 1 | df.LFQ$valid_B >= 2, 1:21]

print(colnames(df.prot[, c(59:61, 65:70, 95:100, 101:106)]))
##  [1] "Razor...unique.peptides.HL_540_CR_C1_A_01.raw.thermo.raw" 
##  [2] "Razor...unique.peptides.HL_540_CR_C1_A_02.raw.thermo.raw" 
##  [3] "Razor...unique.peptides.HL_540_CR_C1_A_03.raw.thermo.raw" 
##  [4] "Razor...unique.peptides.HL_540_CR_C2_A_01.raw.thermo.raw" 
##  [5] "Razor...unique.peptides.HL_540_CR_C2_A_02.raw.thermo.raw" 
##  [6] "Razor...unique.peptides.HL_540_CR_C2_A_03.raw.thermo.raw" 
##  [7] "Razor...unique.peptides.HL_540_CR_C2_B_01.raw.thermo.raw" 
##  [8] "Razor...unique.peptides.HL_540_CR_C2_B_02.raw.thermo.raw" 
##  [9] "Razor...unique.peptides.HL_540_CR_C2_B_03.raw.thermo.raw" 
## [10] "Razor...unique.peptides.HL_540_CR_RE1_A_01.raw.thermo.raw"
## [11] "Razor...unique.peptides.HL_540_CR_RE1_A_02.raw.thermo.raw"
## [12] "Razor...unique.peptides.HL_540_CR_RE1_A_03.raw.thermo.raw"
## [13] "Razor...unique.peptides.HL_540_CR_RE1_B_01.raw.thermo.raw"
## [14] "Razor...unique.peptides.HL_540_CR_RE1_B_02.raw.thermo.raw"
## [15] "Razor...unique.peptides.HL_540_CR_RE1_B_03.raw.thermo.raw"
## [16] "Razor...unique.peptides.HL_540_CR_RE2_A_01.raw.thermo.raw"
## [17] "Razor...unique.peptides.HL_540_CR_RE2_A_02.raw.thermo.raw"
## [18] "Razor...unique.peptides.HL_540_CR_RE2_A_03.raw.thermo.raw"
## [19] "Razor...unique.peptides.HL_540_CR_RE2_B_01.raw.thermo.raw"
## [20] "Razor...unique.peptides.HL_540_CR_RE2_B_02.raw.thermo.raw"
## [21] "Razor...unique.peptides.HL_540_CR_RE2_B_03.raw.thermo.raw"
pep.count.table <- data.frame(
  count     = rowMins(as.matrix(df.prot[-removed_col_indices, c(59:61, 65:70, 95:100, 101:106)])),
  row.names = df.prot$Majority.protein.IDs[-removed_col_indices]
)
pep.count.table$count <- pep.count.table$count + 1

protein.matrix <- as.matrix(df.LFQ.filter)

class     <- factor(c(rep("C", 9), rep("RE", 12)))
replicate <- factor(c(rep("rep1", 3), rep("rep2", 6),
                      rep("rep1", 6), rep("rep2", 6)))

design <- model.matrix(~ 0 + class + replicate)
colnames(design)[1:2] <- c("classC", "classRE")

fit_result <- run_deqms_pipeline(
  protein.matrix, design, pep.count.table,
  contrast_string = "classC-classRE"
)
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
## : pseudoinverse used at 1
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
## : neighborhood radius 1
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
## : reciprocal condition number 1.048e-16

DEqMS.results <- outputResult(fit_result, coef_col = 1)
rownames(df.prot) <- df.prot$Majority.protein.IDs
DEqMS.results$Gene.name <- df.prot[DEqMS.results$gene, ]$Gene.names

timestamp <- format(Sys.time(), "%Y%m%d_%H%M%S")
write.csv(DEqMS.results,
          file = paste0("/proj/proteomics/7_combinedHL/evaluation_paper/abc_2025/raw_log2FCdata/hl540_cryo/data_", timestamp, ".csv"))

processed_df <- DEqMS.results %>%
  dplyr::filter(!is.na(logFC)) %>%
  dplyr::mutate(across(where(is.numeric), ~ round(.x, 3)))
datatable(processed_df, rownames = FALSE)
processed_df_interest <- processed_df %>%
  dplyr::filter(Gene.name %in% proteins_of_interest)
datatable(processed_df_interest, rownames = FALSE, caption = "proteins of interest")
vp_lfc_limit <- 5
plot_volcano(DEqMS.results, "C vs RE", vp_lfc_limit)

sig <- DEqMS.results %>% dplyr::filter(sca.P.Value <= 0.05)
n_up    <- sum(sig$logFC >=  1.0, na.rm = TRUE)
n_down  <- sum(sig$logFC <= -1.0, na.rm = TRUE)
n_total <- sum(abs(sig$logFC) >= 1.0, na.rm = TRUE)
cat("Strongly upregulated (logFC ≥ 1.0):", n_up, "\n")
## Strongly upregulated (logFC ≥ 1.0): 228
cat("Strongly downregulated (logFC ≤ -1.0):", n_down, "\n")
## Strongly downregulated (logFC ≤ -1.0): 162
cat("Total |logFC| ≥ 1.0:", n_total, "\n")
## Total |logFC| ≥ 1.0: 390

6 C vs R

############################################################
# C vs R
############################################################

df <- read.csv("./results_maxquant/hl540_cryo_galaxy.txt", sep = "\t")
df.prot <- df
df.prot <- df.prot[!df.prot$Reverse == "+", ]
df.prot <- df.prot[!df.prot$Potential.contaminant == "+", ]
df.prot <- df.prot[!df.prot$Only.identified.by.site == "+", ]

df.prot$Gene.names <- extract_gene_names(df.prot$Fasta.headers)

df.LFQ <- df.prot[, c(260:262, 266:271, 284:289, 290:295)]
df.LFQ[df.LFQ == 0] <- NA; print(colnames(df.LFQ))
##  [1] "LFQ.intensity.HL_540_CR_C1_A_01.raw.thermo.raw"
##  [2] "LFQ.intensity.HL_540_CR_C1_A_02.raw.thermo.raw"
##  [3] "LFQ.intensity.HL_540_CR_C1_A_03.raw.thermo.raw"
##  [4] "LFQ.intensity.HL_540_CR_C2_A_01.raw.thermo.raw"
##  [5] "LFQ.intensity.HL_540_CR_C2_A_02.raw.thermo.raw"
##  [6] "LFQ.intensity.HL_540_CR_C2_A_03.raw.thermo.raw"
##  [7] "LFQ.intensity.HL_540_CR_C2_B_01.raw.thermo.raw"
##  [8] "LFQ.intensity.HL_540_CR_C2_B_02.raw.thermo.raw"
##  [9] "LFQ.intensity.HL_540_CR_C2_B_03.raw.thermo.raw"
## [10] "LFQ.intensity.HL_540_CR_R1_A_01.raw.thermo.raw"
## [11] "LFQ.intensity.HL_540_CR_R1_A_02.raw.thermo.raw"
## [12] "LFQ.intensity.HL_540_CR_R1_A_03.raw.thermo.raw"
## [13] "LFQ.intensity.HL_540_CR_R1_B_01.raw.thermo.raw"
## [14] "LFQ.intensity.HL_540_CR_R1_B_02.raw.thermo.raw"
## [15] "LFQ.intensity.HL_540_CR_R1_B_03.raw.thermo.raw"
## [16] "LFQ.intensity.HL_540_CR_R2_A_01.raw.thermo.raw"
## [17] "LFQ.intensity.HL_540_CR_R2_A_02.raw.thermo.raw"
## [18] "LFQ.intensity.HL_540_CR_R2_A_03.raw.thermo.raw"
## [19] "LFQ.intensity.HL_540_CR_R2_B_01.raw.thermo.raw"
## [20] "LFQ.intensity.HL_540_CR_R2_B_02.raw.thermo.raw"
## [21] "LFQ.intensity.HL_540_CR_R2_B_03.raw.thermo.raw"
rownames(df.LFQ) <- df.prot$Majority.protein.IDs
df.LFQ <- t(df.LFQ)

keep_cols <- colSums(!is.na(df.LFQ)) > 0
removed_col_indices <- which(!keep_cols)

df.LFQ.clean <- df.LFQ[, colSums(!is.na(df.LFQ)) > 0]
df.LFQ.clean <- log2(df.LFQ.clean)

result <- llsImpute(as.matrix(df.LFQ.clean), k = 10, correlation = "pearson", allVariables = TRUE)
df.LFQ <- as.data.frame(completeObs(result))
df.LFQ <- as.data.frame(t(df.LFQ))

df.LFQ$valid_A <- apply(df.LFQ[, 1:9], 1, function(x) sum(!is.na(x)))
df.LFQ$valid_B <- apply(df.LFQ[, 10:21], 1, function(x) sum(!is.na(x)))
df.LFQ.filter <- df.LFQ[df.LFQ$valid_A >= 1 | df.LFQ$valid_B >= 2, 1:21]

print(colnames(df.prot[, c(59:61, 65:70, 83:88, 89:94)]))
##  [1] "Razor...unique.peptides.HL_540_CR_C1_A_01.raw.thermo.raw"
##  [2] "Razor...unique.peptides.HL_540_CR_C1_A_02.raw.thermo.raw"
##  [3] "Razor...unique.peptides.HL_540_CR_C1_A_03.raw.thermo.raw"
##  [4] "Razor...unique.peptides.HL_540_CR_C2_A_01.raw.thermo.raw"
##  [5] "Razor...unique.peptides.HL_540_CR_C2_A_02.raw.thermo.raw"
##  [6] "Razor...unique.peptides.HL_540_CR_C2_A_03.raw.thermo.raw"
##  [7] "Razor...unique.peptides.HL_540_CR_C2_B_01.raw.thermo.raw"
##  [8] "Razor...unique.peptides.HL_540_CR_C2_B_02.raw.thermo.raw"
##  [9] "Razor...unique.peptides.HL_540_CR_C2_B_03.raw.thermo.raw"
## [10] "Razor...unique.peptides.HL_540_CR_R1_A_01.raw.thermo.raw"
## [11] "Razor...unique.peptides.HL_540_CR_R1_A_02.raw.thermo.raw"
## [12] "Razor...unique.peptides.HL_540_CR_R1_A_03.raw.thermo.raw"
## [13] "Razor...unique.peptides.HL_540_CR_R1_B_01.raw.thermo.raw"
## [14] "Razor...unique.peptides.HL_540_CR_R1_B_02.raw.thermo.raw"
## [15] "Razor...unique.peptides.HL_540_CR_R1_B_03.raw.thermo.raw"
## [16] "Razor...unique.peptides.HL_540_CR_R2_A_01.raw.thermo.raw"
## [17] "Razor...unique.peptides.HL_540_CR_R2_A_02.raw.thermo.raw"
## [18] "Razor...unique.peptides.HL_540_CR_R2_A_03.raw.thermo.raw"
## [19] "Razor...unique.peptides.HL_540_CR_R2_B_01.raw.thermo.raw"
## [20] "Razor...unique.peptides.HL_540_CR_R2_B_02.raw.thermo.raw"
## [21] "Razor...unique.peptides.HL_540_CR_R2_B_03.raw.thermo.raw"
pep.count.table <- data.frame(
  count     = rowMins(as.matrix(df.prot[-removed_col_indices, c(59:61, 65:70, 83:88, 89:94)])),
  row.names = df.prot$Majority.protein.IDs[-removed_col_indices]
)
pep.count.table$count <- pep.count.table$count + 1

protein.matrix <- as.matrix(df.LFQ.filter)

class     <- factor(c(rep("C", 9), rep("R", 12)))
replicate <- factor(c(rep("rep1", 3), rep("rep2", 6),
                      rep("rep1", 6), rep("rep2", 6)))

design <- model.matrix(~ 0 + class + replicate)
colnames(design)[1:2] <- c("classC", "classR")

fit_result <- run_deqms_pipeline(
  protein.matrix, design, pep.count.table,
  contrast_string = "classC-classR"
)

DEqMS.results <- outputResult(fit_result, coef_col = 1)
rownames(df.prot) <- df.prot$Majority.protein.IDs
DEqMS.results$Gene.name <- df.prot[DEqMS.results$gene, ]$Gene.names

timestamp <- format(Sys.time(), "%Y%m%d_%H%M%S")
write.csv(DEqMS.results,
          file = paste0("/proj/proteomics/7_combinedHL/evaluation_paper/abc_2025/raw_log2FCdata/hl540_cryo/data_", timestamp, ".csv"))

processed_df <- DEqMS.results %>%
  dplyr::filter(!is.na(logFC)) %>%
  dplyr::mutate(across(where(is.numeric), ~ round(.x, 3)))
datatable(processed_df, rownames = FALSE)
processed_df_interest <- processed_df %>%
  dplyr::filter(Gene.name %in% proteins_of_interest)
datatable(processed_df_interest, rownames = FALSE, caption = "proteins of interest")
vp_lfc_limit <- 5
plot_volcano(DEqMS.results, "C vs R", vp_lfc_limit)

sig <- DEqMS.results %>% dplyr::filter(sca.P.Value <= 0.05)
n_up    <- sum(sig$logFC >=  1.0, na.rm = TRUE)
n_down  <- sum(sig$logFC <= -1.0, na.rm = TRUE)
n_total <- sum(abs(sig$logFC) >= 1.0, na.rm = TRUE)
cat("Strongly upregulated (logFC ≥ 1.0):", n_up, "\n")
## Strongly upregulated (logFC ≥ 1.0): 15
cat("Strongly downregulated (logFC ≤ -1.0):", n_down, "\n")
## Strongly downregulated (logFC ≤ -1.0): 21
cat("Total |logFC| ≥ 1.0:", n_total, "\n")
## Total |logFC| ≥ 1.0: 36

7 E vs R

############################################################
# E vs R
############################################################

df <- read.csv("./results_maxquant/hl540_cryo_galaxy.txt", sep = "\t")
df.prot <- df
df.prot <- df.prot[!df.prot$Reverse == "+", ]
df.prot <- df.prot[!df.prot$Potential.contaminant == "+", ]
df.prot <- df.prot[!df.prot$Only.identified.by.site == "+", ]

df.prot$Gene.names <- extract_gene_names(df.prot$Fasta.headers)

df.LFQ <- df.prot[, c(272:277, 278:283, 284:289, 290:295)]
df.LFQ[df.LFQ == 0] <- NA; print(colnames(df.LFQ))
##  [1] "LFQ.intensity.HL_540_CR_E1_A_01.raw.thermo.raw"
##  [2] "LFQ.intensity.HL_540_CR_E1_A_02.raw.thermo.raw"
##  [3] "LFQ.intensity.HL_540_CR_E1_A_03.raw.thermo.raw"
##  [4] "LFQ.intensity.HL_540_CR_E1_B_01.raw.thermo.raw"
##  [5] "LFQ.intensity.HL_540_CR_E1_B_02.raw.thermo.raw"
##  [6] "LFQ.intensity.HL_540_CR_E1_B_03.raw.thermo.raw"
##  [7] "LFQ.intensity.HL_540_CR_E2_A_01.raw.thermo.raw"
##  [8] "LFQ.intensity.HL_540_CR_E2_A_02.raw.thermo.raw"
##  [9] "LFQ.intensity.HL_540_CR_E2_A_03.raw.thermo.raw"
## [10] "LFQ.intensity.HL_540_CR_E2_B_01.raw.thermo.raw"
## [11] "LFQ.intensity.HL_540_CR_E2_B_02.raw.thermo.raw"
## [12] "LFQ.intensity.HL_540_CR_E2_B_03.raw.thermo.raw"
## [13] "LFQ.intensity.HL_540_CR_R1_A_01.raw.thermo.raw"
## [14] "LFQ.intensity.HL_540_CR_R1_A_02.raw.thermo.raw"
## [15] "LFQ.intensity.HL_540_CR_R1_A_03.raw.thermo.raw"
## [16] "LFQ.intensity.HL_540_CR_R1_B_01.raw.thermo.raw"
## [17] "LFQ.intensity.HL_540_CR_R1_B_02.raw.thermo.raw"
## [18] "LFQ.intensity.HL_540_CR_R1_B_03.raw.thermo.raw"
## [19] "LFQ.intensity.HL_540_CR_R2_A_01.raw.thermo.raw"
## [20] "LFQ.intensity.HL_540_CR_R2_A_02.raw.thermo.raw"
## [21] "LFQ.intensity.HL_540_CR_R2_A_03.raw.thermo.raw"
## [22] "LFQ.intensity.HL_540_CR_R2_B_01.raw.thermo.raw"
## [23] "LFQ.intensity.HL_540_CR_R2_B_02.raw.thermo.raw"
## [24] "LFQ.intensity.HL_540_CR_R2_B_03.raw.thermo.raw"
rownames(df.LFQ) <- df.prot$Majority.protein.IDs
df.LFQ <- t(df.LFQ)

keep_cols <- colSums(!is.na(df.LFQ)) > 0
removed_col_indices <- which(!keep_cols)

df.LFQ.clean <- df.LFQ[, colSums(!is.na(df.LFQ)) > 0]
df.LFQ.clean <- log2(df.LFQ.clean)

result <- llsImpute(as.matrix(df.LFQ.clean), k = 10, correlation = "pearson", allVariables = TRUE)
df.LFQ <- as.data.frame(completeObs(result))
df.LFQ <- as.data.frame(t(df.LFQ))

df.LFQ$valid_A <- apply(df.LFQ[, 1:12], 1, function(x) sum(!is.na(x)))
df.LFQ$valid_B <- apply(df.LFQ[, 13:24], 1, function(x) sum(!is.na(x)))
df.LFQ.filter <- df.LFQ[df.LFQ$valid_A >= 2 | df.LFQ$valid_B >= 2, 1:24]

print(colnames(df.prot[, c(71:76, 77:82, 83:88, 89:94)]))
##  [1] "Razor...unique.peptides.HL_540_CR_E1_A_01.raw.thermo.raw"
##  [2] "Razor...unique.peptides.HL_540_CR_E1_A_02.raw.thermo.raw"
##  [3] "Razor...unique.peptides.HL_540_CR_E1_A_03.raw.thermo.raw"
##  [4] "Razor...unique.peptides.HL_540_CR_E1_B_01.raw.thermo.raw"
##  [5] "Razor...unique.peptides.HL_540_CR_E1_B_02.raw.thermo.raw"
##  [6] "Razor...unique.peptides.HL_540_CR_E1_B_03.raw.thermo.raw"
##  [7] "Razor...unique.peptides.HL_540_CR_E2_A_01.raw.thermo.raw"
##  [8] "Razor...unique.peptides.HL_540_CR_E2_A_02.raw.thermo.raw"
##  [9] "Razor...unique.peptides.HL_540_CR_E2_A_03.raw.thermo.raw"
## [10] "Razor...unique.peptides.HL_540_CR_E2_B_01.raw.thermo.raw"
## [11] "Razor...unique.peptides.HL_540_CR_E2_B_02.raw.thermo.raw"
## [12] "Razor...unique.peptides.HL_540_CR_E2_B_03.raw.thermo.raw"
## [13] "Razor...unique.peptides.HL_540_CR_R1_A_01.raw.thermo.raw"
## [14] "Razor...unique.peptides.HL_540_CR_R1_A_02.raw.thermo.raw"
## [15] "Razor...unique.peptides.HL_540_CR_R1_A_03.raw.thermo.raw"
## [16] "Razor...unique.peptides.HL_540_CR_R1_B_01.raw.thermo.raw"
## [17] "Razor...unique.peptides.HL_540_CR_R1_B_02.raw.thermo.raw"
## [18] "Razor...unique.peptides.HL_540_CR_R1_B_03.raw.thermo.raw"
## [19] "Razor...unique.peptides.HL_540_CR_R2_A_01.raw.thermo.raw"
## [20] "Razor...unique.peptides.HL_540_CR_R2_A_02.raw.thermo.raw"
## [21] "Razor...unique.peptides.HL_540_CR_R2_A_03.raw.thermo.raw"
## [22] "Razor...unique.peptides.HL_540_CR_R2_B_01.raw.thermo.raw"
## [23] "Razor...unique.peptides.HL_540_CR_R2_B_02.raw.thermo.raw"
## [24] "Razor...unique.peptides.HL_540_CR_R2_B_03.raw.thermo.raw"
pep.count.table <- data.frame(
  count     = rowMins(as.matrix(df.prot[-removed_col_indices, c(71:76, 77:82, 83:88, 89:94)])),
  row.names = df.prot$Majority.protein.IDs[-removed_col_indices]
)
pep.count.table$count <- pep.count.table$count + 1

protein.matrix <- as.matrix(df.LFQ.filter)

class     <- factor(c(rep("E", 12), rep("R", 12)))
replicate <- factor(c(rep("rep1", 6), rep("rep2", 6),
                      rep("rep1", 6), rep("rep2", 6)))

design <- model.matrix(~ 0 + class + replicate)
colnames(design)[1:2] <- c("classE", "classR")

fit_result <- run_deqms_pipeline(
  protein.matrix, design, pep.count.table,
  contrast_string = "classE-classR"
)
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
## : pseudoinverse used at 1
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
## : neighborhood radius 1
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
## : reciprocal condition number 9.4178e-17

DEqMS.results <- outputResult(fit_result, coef_col = 1)
rownames(df.prot) <- df.prot$Majority.protein.IDs
DEqMS.results$Gene.name <- df.prot[DEqMS.results$gene, ]$Gene.names

timestamp <- format(Sys.time(), "%Y%m%d_%H%M%S")
write.csv(DEqMS.results,
          file = paste0("/proj/proteomics/7_combinedHL/evaluation_paper/abc_2025/raw_log2FCdata/hl540_cryo/data_", timestamp, ".csv"))

processed_df <- DEqMS.results %>%
  dplyr::filter(!is.na(logFC)) %>%
  dplyr::mutate(across(where(is.numeric), ~ round(.x, 3)))
datatable(processed_df, rownames = FALSE)
processed_df_interest <- processed_df %>%
  dplyr::filter(Gene.name %in% proteins_of_interest)
datatable(processed_df_interest, rownames = FALSE, caption = "proteins of interest")
vp_lfc_limit <- 5
plot_volcano(DEqMS.results, "E vs R", vp_lfc_limit)

sig <- DEqMS.results %>% dplyr::filter(sca.P.Value <= 0.05)
n_up    <- sum(sig$logFC >=  1.0, na.rm = TRUE)
n_down  <- sum(sig$logFC <= -1.0, na.rm = TRUE)
n_total <- sum(abs(sig$logFC) >= 1.0, na.rm = TRUE)
cat("Strongly upregulated (logFC ≥ 1.0):", n_up, "\n")
## Strongly upregulated (logFC ≥ 1.0): 80
cat("Strongly downregulated (logFC ≤ -1.0):", n_down, "\n")
## Strongly downregulated (logFC ≤ -1.0): 119
cat("Total |logFC| ≥ 1.0:", n_total, "\n")
## Total |logFC| ≥ 1.0: 199

8 E vs RE

############################################################
# E vs RE
############################################################
df <- read.csv("./results_maxquant/hl540_cryo_galaxy.txt", sep = "\t")
df.prot <- df
df.prot <- df.prot[!df.prot$Reverse == "+", ]
df.prot <- df.prot[!df.prot$Potential.contaminant == "+", ]
df.prot <- df.prot[!df.prot$Only.identified.by.site == "+", ]

df.prot$Gene.names <- extract_gene_names(df.prot$Fasta.headers)

df.LFQ <- df.prot[, c(272:277, 278:283, 296:301, 302:307)]
df.LFQ[df.LFQ == 0] <- NA; print(colnames(df.LFQ))
##  [1] "LFQ.intensity.HL_540_CR_E1_A_01.raw.thermo.raw" 
##  [2] "LFQ.intensity.HL_540_CR_E1_A_02.raw.thermo.raw" 
##  [3] "LFQ.intensity.HL_540_CR_E1_A_03.raw.thermo.raw" 
##  [4] "LFQ.intensity.HL_540_CR_E1_B_01.raw.thermo.raw" 
##  [5] "LFQ.intensity.HL_540_CR_E1_B_02.raw.thermo.raw" 
##  [6] "LFQ.intensity.HL_540_CR_E1_B_03.raw.thermo.raw" 
##  [7] "LFQ.intensity.HL_540_CR_E2_A_01.raw.thermo.raw" 
##  [8] "LFQ.intensity.HL_540_CR_E2_A_02.raw.thermo.raw" 
##  [9] "LFQ.intensity.HL_540_CR_E2_A_03.raw.thermo.raw" 
## [10] "LFQ.intensity.HL_540_CR_E2_B_01.raw.thermo.raw" 
## [11] "LFQ.intensity.HL_540_CR_E2_B_02.raw.thermo.raw" 
## [12] "LFQ.intensity.HL_540_CR_E2_B_03.raw.thermo.raw" 
## [13] "LFQ.intensity.HL_540_CR_RE1_A_01.raw.thermo.raw"
## [14] "LFQ.intensity.HL_540_CR_RE1_A_02.raw.thermo.raw"
## [15] "LFQ.intensity.HL_540_CR_RE1_A_03.raw.thermo.raw"
## [16] "LFQ.intensity.HL_540_CR_RE1_B_01.raw.thermo.raw"
## [17] "LFQ.intensity.HL_540_CR_RE1_B_02.raw.thermo.raw"
## [18] "LFQ.intensity.HL_540_CR_RE1_B_03.raw.thermo.raw"
## [19] "LFQ.intensity.HL_540_CR_RE2_A_01.raw.thermo.raw"
## [20] "LFQ.intensity.HL_540_CR_RE2_A_02.raw.thermo.raw"
## [21] "LFQ.intensity.HL_540_CR_RE2_A_03.raw.thermo.raw"
## [22] "LFQ.intensity.HL_540_CR_RE2_B_01.raw.thermo.raw"
## [23] "LFQ.intensity.HL_540_CR_RE2_B_02.raw.thermo.raw"
## [24] "LFQ.intensity.HL_540_CR_RE2_B_03.raw.thermo.raw"
rownames(df.LFQ) <- df.prot$Majority.protein.IDs
df.LFQ <- t(df.LFQ)

keep_cols <- colSums(!is.na(df.LFQ)) > 0
removed_col_indices <- which(!keep_cols)

df.LFQ.clean <- df.LFQ[, colSums(!is.na(df.LFQ)) > 0]
df.LFQ.clean <- log2(df.LFQ.clean)

result <- llsImpute(as.matrix(df.LFQ.clean), k = 10, correlation = "pearson", allVariables = TRUE)
df.LFQ <- as.data.frame(completeObs(result))
df.LFQ <- as.data.frame(t(df.LFQ))

df.LFQ$valid_A <- apply(df.LFQ[, 1:12], 1, function(x) sum(!is.na(x)))
df.LFQ$valid_B <- apply(df.LFQ[, 13:24], 1, function(x) sum(!is.na(x)))
df.LFQ.filter <- df.LFQ[df.LFQ$valid_A >= 2 | df.LFQ$valid_B >= 2, 1:24]

print(colnames(df.prot[, c(71:76, 77:82, 95:100, 101:106)]))
##  [1] "Razor...unique.peptides.HL_540_CR_E1_A_01.raw.thermo.raw" 
##  [2] "Razor...unique.peptides.HL_540_CR_E1_A_02.raw.thermo.raw" 
##  [3] "Razor...unique.peptides.HL_540_CR_E1_A_03.raw.thermo.raw" 
##  [4] "Razor...unique.peptides.HL_540_CR_E1_B_01.raw.thermo.raw" 
##  [5] "Razor...unique.peptides.HL_540_CR_E1_B_02.raw.thermo.raw" 
##  [6] "Razor...unique.peptides.HL_540_CR_E1_B_03.raw.thermo.raw" 
##  [7] "Razor...unique.peptides.HL_540_CR_E2_A_01.raw.thermo.raw" 
##  [8] "Razor...unique.peptides.HL_540_CR_E2_A_02.raw.thermo.raw" 
##  [9] "Razor...unique.peptides.HL_540_CR_E2_A_03.raw.thermo.raw" 
## [10] "Razor...unique.peptides.HL_540_CR_E2_B_01.raw.thermo.raw" 
## [11] "Razor...unique.peptides.HL_540_CR_E2_B_02.raw.thermo.raw" 
## [12] "Razor...unique.peptides.HL_540_CR_E2_B_03.raw.thermo.raw" 
## [13] "Razor...unique.peptides.HL_540_CR_RE1_A_01.raw.thermo.raw"
## [14] "Razor...unique.peptides.HL_540_CR_RE1_A_02.raw.thermo.raw"
## [15] "Razor...unique.peptides.HL_540_CR_RE1_A_03.raw.thermo.raw"
## [16] "Razor...unique.peptides.HL_540_CR_RE1_B_01.raw.thermo.raw"
## [17] "Razor...unique.peptides.HL_540_CR_RE1_B_02.raw.thermo.raw"
## [18] "Razor...unique.peptides.HL_540_CR_RE1_B_03.raw.thermo.raw"
## [19] "Razor...unique.peptides.HL_540_CR_RE2_A_01.raw.thermo.raw"
## [20] "Razor...unique.peptides.HL_540_CR_RE2_A_02.raw.thermo.raw"
## [21] "Razor...unique.peptides.HL_540_CR_RE2_A_03.raw.thermo.raw"
## [22] "Razor...unique.peptides.HL_540_CR_RE2_B_01.raw.thermo.raw"
## [23] "Razor...unique.peptides.HL_540_CR_RE2_B_02.raw.thermo.raw"
## [24] "Razor...unique.peptides.HL_540_CR_RE2_B_03.raw.thermo.raw"
pep.count.table <- data.frame(
  count     = rowMins(as.matrix(df.prot[-removed_col_indices, c(71:76, 77:82, 95:100, 101:106)])),
  row.names = df.prot$Majority.protein.IDs[-removed_col_indices]
)
pep.count.table$count <- pep.count.table$count + 1

protein.matrix <- as.matrix(df.LFQ.filter)

class     <- factor(c(rep("E", 12), rep("RE", 12)))
replicate <- factor(c(rep("rep1", 6), rep("rep2", 6),
                      rep("rep1", 6), rep("rep2", 6)))

design <- model.matrix(~ 0 + class + replicate)
colnames(design)[1:2] <- c("classE", "classRE")

fit_result <- run_deqms_pipeline(
  protein.matrix, design, pep.count.table,
  contrast_string = "classE-classRE"
)
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
## : pseudoinverse used at 1
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
## : neighborhood radius 1
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
## : reciprocal condition number 9.9968e-17

DEqMS.results <- outputResult(fit_result, coef_col = 1)
rownames(df.prot) <- df.prot$Majority.protein.IDs
DEqMS.results$Gene.name <- df.prot[DEqMS.results$gene, ]$Gene.names

timestamp <- format(Sys.time(), "%Y%m%d_%H%M%S")
write.csv(DEqMS.results,
          file = paste0("/proj/proteomics/7_combinedHL/evaluation_paper/abc_2025/raw_log2FCdata/hl540_cryo/data_", timestamp, ".csv"))

processed_df <- DEqMS.results %>%
  dplyr::filter(!is.na(logFC)) %>%
  dplyr::mutate(across(where(is.numeric), ~ round(.x, 3)))
datatable(processed_df, rownames = FALSE)
processed_df_interest <- processed_df %>%
  dplyr::filter(Gene.name %in% proteins_of_interest)
datatable(processed_df_interest, rownames = FALSE, caption = "proteins of interest")
vp_lfc_limit <- 5
plot_volcano(DEqMS.results, "E vs RE", vp_lfc_limit)

sig <- DEqMS.results %>% dplyr::filter(sca.P.Value <= 0.05)
n_up    <- sum(sig$logFC >=  1.0, na.rm = TRUE)
n_down  <- sum(sig$logFC <= -1.0, na.rm = TRUE)
n_total <- sum(abs(sig$logFC) >= 1.0, na.rm = TRUE)
cat("Strongly upregulated (logFC ≥ 1.0):", n_up, "\n")
## Strongly upregulated (logFC ≥ 1.0): 10
cat("Strongly downregulated (logFC ≤ -1.0):", n_down, "\n")
## Strongly downregulated (logFC ≤ -1.0): 11
cat("Total |logFC| ≥ 1.0:", n_total, "\n")
## Total |logFC| ≥ 1.0: 21

9 R vs RE

############################################################
# R vs RE
############################################################

df <- read.csv("./results_maxquant/hl540_cryo_galaxy.txt", sep = "\t")
df.prot <- df
df.prot <- df.prot[!df.prot$Reverse == "+", ]
df.prot <- df.prot[!df.prot$Potential.contaminant == "+", ]
df.prot <- df.prot[!df.prot$Only.identified.by.site == "+", ]

df.prot$Gene.names <- extract_gene_names(df.prot$Fasta.headers)

df.LFQ <- df.prot[, c(284:289, 290:295, 296:301, 302:307)]
df.LFQ[df.LFQ == 0] <- NA; print(colnames(df.LFQ))
##  [1] "LFQ.intensity.HL_540_CR_R1_A_01.raw.thermo.raw" 
##  [2] "LFQ.intensity.HL_540_CR_R1_A_02.raw.thermo.raw" 
##  [3] "LFQ.intensity.HL_540_CR_R1_A_03.raw.thermo.raw" 
##  [4] "LFQ.intensity.HL_540_CR_R1_B_01.raw.thermo.raw" 
##  [5] "LFQ.intensity.HL_540_CR_R1_B_02.raw.thermo.raw" 
##  [6] "LFQ.intensity.HL_540_CR_R1_B_03.raw.thermo.raw" 
##  [7] "LFQ.intensity.HL_540_CR_R2_A_01.raw.thermo.raw" 
##  [8] "LFQ.intensity.HL_540_CR_R2_A_02.raw.thermo.raw" 
##  [9] "LFQ.intensity.HL_540_CR_R2_A_03.raw.thermo.raw" 
## [10] "LFQ.intensity.HL_540_CR_R2_B_01.raw.thermo.raw" 
## [11] "LFQ.intensity.HL_540_CR_R2_B_02.raw.thermo.raw" 
## [12] "LFQ.intensity.HL_540_CR_R2_B_03.raw.thermo.raw" 
## [13] "LFQ.intensity.HL_540_CR_RE1_A_01.raw.thermo.raw"
## [14] "LFQ.intensity.HL_540_CR_RE1_A_02.raw.thermo.raw"
## [15] "LFQ.intensity.HL_540_CR_RE1_A_03.raw.thermo.raw"
## [16] "LFQ.intensity.HL_540_CR_RE1_B_01.raw.thermo.raw"
## [17] "LFQ.intensity.HL_540_CR_RE1_B_02.raw.thermo.raw"
## [18] "LFQ.intensity.HL_540_CR_RE1_B_03.raw.thermo.raw"
## [19] "LFQ.intensity.HL_540_CR_RE2_A_01.raw.thermo.raw"
## [20] "LFQ.intensity.HL_540_CR_RE2_A_02.raw.thermo.raw"
## [21] "LFQ.intensity.HL_540_CR_RE2_A_03.raw.thermo.raw"
## [22] "LFQ.intensity.HL_540_CR_RE2_B_01.raw.thermo.raw"
## [23] "LFQ.intensity.HL_540_CR_RE2_B_02.raw.thermo.raw"
## [24] "LFQ.intensity.HL_540_CR_RE2_B_03.raw.thermo.raw"
rownames(df.LFQ) <- df.prot$Majority.protein.IDs
df.LFQ <- t(df.LFQ)

keep_cols <- colSums(!is.na(df.LFQ)) > 0
removed_col_indices <- which(!keep_cols)

df.LFQ.clean <- df.LFQ[, colSums(!is.na(df.LFQ)) > 0]
df.LFQ.clean <- log2(df.LFQ.clean)

result <- llsImpute(as.matrix(df.LFQ.clean), k = 10, correlation = "pearson", allVariables = TRUE)
df.LFQ <- as.data.frame(completeObs(result))
df.LFQ <- as.data.frame(t(df.LFQ))

df.LFQ$valid_A <- apply(df.LFQ[, 1:12], 1, function(x) sum(!is.na(x)))
df.LFQ$valid_B <- apply(df.LFQ[, 13:24], 1, function(x) sum(!is.na(x)))
df.LFQ.filter <- df.LFQ[df.LFQ$valid_A >= 2 | df.LFQ$valid_B >= 2, 1:24]

print(colnames(df.prot[, c(83:88, 89:94, 95:100, 101:106)]))
##  [1] "Razor...unique.peptides.HL_540_CR_R1_A_01.raw.thermo.raw" 
##  [2] "Razor...unique.peptides.HL_540_CR_R1_A_02.raw.thermo.raw" 
##  [3] "Razor...unique.peptides.HL_540_CR_R1_A_03.raw.thermo.raw" 
##  [4] "Razor...unique.peptides.HL_540_CR_R1_B_01.raw.thermo.raw" 
##  [5] "Razor...unique.peptides.HL_540_CR_R1_B_02.raw.thermo.raw" 
##  [6] "Razor...unique.peptides.HL_540_CR_R1_B_03.raw.thermo.raw" 
##  [7] "Razor...unique.peptides.HL_540_CR_R2_A_01.raw.thermo.raw" 
##  [8] "Razor...unique.peptides.HL_540_CR_R2_A_02.raw.thermo.raw" 
##  [9] "Razor...unique.peptides.HL_540_CR_R2_A_03.raw.thermo.raw" 
## [10] "Razor...unique.peptides.HL_540_CR_R2_B_01.raw.thermo.raw" 
## [11] "Razor...unique.peptides.HL_540_CR_R2_B_02.raw.thermo.raw" 
## [12] "Razor...unique.peptides.HL_540_CR_R2_B_03.raw.thermo.raw" 
## [13] "Razor...unique.peptides.HL_540_CR_RE1_A_01.raw.thermo.raw"
## [14] "Razor...unique.peptides.HL_540_CR_RE1_A_02.raw.thermo.raw"
## [15] "Razor...unique.peptides.HL_540_CR_RE1_A_03.raw.thermo.raw"
## [16] "Razor...unique.peptides.HL_540_CR_RE1_B_01.raw.thermo.raw"
## [17] "Razor...unique.peptides.HL_540_CR_RE1_B_02.raw.thermo.raw"
## [18] "Razor...unique.peptides.HL_540_CR_RE1_B_03.raw.thermo.raw"
## [19] "Razor...unique.peptides.HL_540_CR_RE2_A_01.raw.thermo.raw"
## [20] "Razor...unique.peptides.HL_540_CR_RE2_A_02.raw.thermo.raw"
## [21] "Razor...unique.peptides.HL_540_CR_RE2_A_03.raw.thermo.raw"
## [22] "Razor...unique.peptides.HL_540_CR_RE2_B_01.raw.thermo.raw"
## [23] "Razor...unique.peptides.HL_540_CR_RE2_B_02.raw.thermo.raw"
## [24] "Razor...unique.peptides.HL_540_CR_RE2_B_03.raw.thermo.raw"
pep.count.table <- data.frame(
  count     = rowMins(as.matrix(df.prot[-removed_col_indices, c(83:88, 89:94, 95:100, 101:106)])),
  row.names = df.prot$Majority.protein.IDs[-removed_col_indices]
)
pep.count.table$count <- pep.count.table$count + 1

protein.matrix <- as.matrix(df.LFQ.filter)

class     <- factor(c(rep("E", 12), rep("RE", 12)))  # (as in your original block)
replicate <- factor(c(rep("rep1", 6), rep("rep2", 6),
                      rep("rep1", 6), rep("rep2", 6)))

design <- model.matrix(~ 0 + class + replicate)
colnames(design)[1:2] <- c("classE", "classRE")

fit_result <- run_deqms_pipeline(
  protein.matrix, design, pep.count.table,
  contrast_string = "classE-classRE"
)
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
## : pseudoinverse used at 1
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
## : neighborhood radius 1
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
## : reciprocal condition number 7.8876e-17

DEqMS.results <- outputResult(fit_result, coef_col = 1)
rownames(df.prot) <- df.prot$Majority.protein.IDs
DEqMS.results$Gene.name <- df.prot[DEqMS.results$gene, ]$Gene.names

timestamp <- format(Sys.time(), "%Y%m%d_%H%M%S")
write.csv(DEqMS.results,
          file = paste0("/proj/proteomics/7_combinedHL/evaluation_paper/abc_2025/raw_log2FCdata/hl540_cryo/data_", timestamp, ".csv"))

processed_df <- DEqMS.results %>%
  dplyr::filter(!is.na(logFC)) %>%
  dplyr::mutate(across(where(is.numeric), ~ round(.x, 3)))
datatable(processed_df, rownames = FALSE)
processed_df_interest <- processed_df %>%
  dplyr::filter(Gene.name %in% proteins_of_interest)
datatable(processed_df_interest, rownames = FALSE, caption = "proteins of interest")
vp_lfc_limit <- 5
plot_volcano(DEqMS.results, "R vs RE", vp_lfc_limit)

sig <- DEqMS.results %>% dplyr::filter(sca.P.Value <= 0.05)
n_up    <- sum(sig$logFC >=  1.0, na.rm = TRUE)
n_down  <- sum(sig$logFC <= -1.0, na.rm = TRUE)
n_total <- sum(abs(sig$logFC) >= 1.0, na.rm = TRUE)
cat("Strongly upregulated (logFC ≥ 1.0):", n_up, "\n")
## Strongly upregulated (logFC ≥ 1.0): 196
cat("Strongly downregulated (logFC ≤ -1.0):", n_down, "\n")
## Strongly downregulated (logFC ≤ -1.0): 124
cat("Total |logFC| ≥ 1.0:", n_total, "\n")
## Total |logFC| ≥ 1.0: 320